load libs

library(DBI)
library(broom)
library(splines)
library(stringr)
library(dplyr)
library(ggplot2)

Load exploration

source("../6_nhanes_data/phesant.R")

exposure_vars <-
  read.delim("../data/select_vars.txt", header = FALSE)$V1
exposure_cols <- paste(exposure_vars, collapse = ", ")


# Load data and convert data types according to PHESEANT.
# Return data set and PHESEANT results.
load_data <- function(exposure_cols) {
  nhanes_db <- dbConnect(RSQLite::SQLite(), "../nhanes.sqlite")
  
  cols <-
    'BMXWAIST, RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR,SDMVPSU,SDMVSTRA, years,'
  data_sql <-
    paste('SELECT', cols, exposure_cols, 'FROM merged_table')
  
  data <- dbGetQuery(nhanes_db, data_sql)
  data <- na.omit(data)
  dbDisconnect(nhanes_db)
  
  phs_res <- phesant(data)
  
  data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])] <-
    lapply(data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])], as.factor)
  
  data[, names(phs_res[phs_res == "continuous"])] <-
    lapply(data[, names(phs_res[phs_res == "continuous"])], as.numeric)
  
  return(list(data = data, phs_res = phs_res))
  
}

data_phs <- load_data(exposure_cols)

data <- data_phs$data
phs_res <- data_phs$phs_res

Inverse Normal Distribution and build formula

invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}
min_max_norm <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}


build_formula <- function(base_model,exposure_var,inv=FALSE) {
  var <- "+ %s"
  if(inv) var <- " + invNorm(%s)" 
  
  as.formula(
    sprintf(paste0(base_model, var),
      exposure_var
    )
  )
}

EnWAS

enwas <-
  function(base_model,
           exposure_vars,
           data_set,
           inv_norm = FALSE) {
    num_var <- length(exposure_vars)
    
    model_list <- vector(mode = "list", length = num_var)
    num_cols <- sapply(data_set[, exposure_vars], is.numeric)
    num_cols <- names(num_cols[num_cols == TRUE])
    if (inv_norm) {
      data_set[, num_cols] <- lapply(data_set[, num_cols], invNorm)
    }
    
    
    
    
    association_list <- data.frame()
    factor_terms <- c() # to hold the factors terms
    num_var <- length(exposure_vars)
    for (i in 1:num_var) {
      exposure <- exposure_vars[i]
      
      if (is.factor(data_set[, exposure])) {
        factor_terms <-
          c(factor_terms, paste0(exposure, levels(data_set[, exposure])))
      }
      
      model <- build_formula(base_model, exposure)
      
      mod <- lm(model, data_set)
      model_list[[i]] <- mod
      mod_df <- tidy(mod)
      association_list <-
        rbind(association_list, mod_df) # step 3b of algorithm
    }
    
    
    xwas_list <-
      association_list[association_list$term %in% c(exposure_vars, factor_terms), ]

    
    # sd_x_list <-  sapply(data_set[,num_cols],sd)
    sd_x_list <-  sapply(data_set, function(x)
      sd(as.numeric(x)))
    for (var in exposure_vars) {
      xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] <-
        xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] * sd_x_list[var]
    }
    
    
    
    xwas_list$fdr <-
      p.adjust(xwas_list$p.value, method = 'BY')
    
    
    
    xwas_list <- xwas_list |>
      mutate(lower = estimate - 1.96 * std.error)  |>
      mutate(upper = estimate + 1.96 * std.error)
    
    return (list(model_list = model_list, enwas_res = xwas_list))
    
  }




forest_plot <- function(xwas_result) {
  xwas_result |>
    ggplot(aes(
      x = term,
      y = estimate,
      ymin = lower,
      ymax = upper,
      colour = estimate
    )) +
    geom_pointrange() +
    coord_flip() +  # flip coordinates (puts labels on y axis)
    xlab("Exposures") + ylab("Estimate (95% CI)") +
    theme_bw()  # use a white background
}


forest_plot_mult <- function(xwas_result_list) {
  xwas_result <- do.call("rbind", xwas_result_list)
  xwas_result$EnWAS <-
    rep(names(xwas_result_list), each = nrow(xwas_result_list[[1]]))
  
  xwas_result |>
    ggplot(aes(
      x = term,
      y = estimate,
      ymin = lower,
      ymax = upper,
      colour = EnWAS
    )) +
    geom_pointrange(alpha = 0.4) +
    coord_flip() +  # flip coordinates (puts labels on y axis)
    xlab("Exposures") + ylab("Estimate (95% CI)") +
    theme_bw()  # use a white background
}

Run EnWAS

linear_model <- 'BMXWAIST ~ RIDAGEYR*RIAGENDR + BMXBMI'
linear_res2 <- enwas(linear_model, exposure_vars, data)



all_ns_model <-
  'BMXWAIST ~ ns(RIDAGEYR, knots = seq(30, 80, by = 10)) * RIAGENDR + ns(BMXBMI,knots = seq(30, 80, by = 10))'
all_ns_res <- enwas(all_ns_model, exposure_vars, data)


lm_inverse_res <- enwas(linear_model, exposure_vars, data,inv = TRUE)
ns_inverse_res <- enwas(all_ns_model, exposure_vars, data,inv = TRUE)

Run EnWAS

forest_plot(linear_res2$enwas_res)

forest_plot_mult(
  list(
    linear = linear_res2$enwas_res,
    ns = all_ns_res$enwas_res
  )
)

forest_plot_mult(
  list(
    lm_inv = lm_inverse_res$enwas_res,
    ns_inv = ns_inverse_res$enwas_res
  )
)

forest_plot_mult(
  list(
    linear = linear_res2$enwas_res,
    lm_inv = lm_inverse_res$enwas_res
  )
)

forest_plot_mult(
  list(
    ns = all_ns_res$enwas_res,
    ns_inv = ns_inverse_res$enwas_res
  )
)

EnWAS Results numerical only

num_cols <- sapply(data[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])


linear <- linear_res2$enwas_res
linear <- linear[linear$term %in% num_cols,]
ns <- all_ns_res$enwas_res
ns <- ns[ns$term %in% num_cols,]
    
lm_inv <- lm_inverse_res$enwas_res
lm_inv <- lm_inv[lm_inv$term %in% num_cols,]
ns_inv <- ns_inverse_res$enwas_res
ns_inv <- ns_inv[ns_inv$term %in% num_cols,]

forest_plot_mult(
  list(
    ns = ns,
    linear = linear
  )
)

forest_plot_mult(
  list(
    lm_inv = lm_inv,
    ns_inv = ns_inv
  )
)

forest_plot_mult(
  list(
    linear = linear,
    lm_inv = lm_inv
  )
)

forest_plot_mult(
  list(
    ns = ns,
    ns_inv = ns_inv
  )
)

EnWAS show interesting variables

xwas_res_list <- list(linear = linear,ns = ns,lm_inv = lm_inv,ns_inv = ns_inv)
xwas_result <- do.call("rbind",xwas_res_list )
xwas_result$EnWAS <-
  rep(names(xwas_res_list), each = nrow(xwas_res_list[[1]]))

xwas_result$postive <- xwas_result$lower * xwas_result$upper
xwas_result$postive <- xwas_result$postive >=0

# show the false positive raised in linear base model
ns_vs_linear <- xwas_result[xwas_result$EnWAS=="linear",]$postive != xwas_result[xwas_result$EnWAS=="ns",]$postive
num_cols[ns_vs_linear]

[1] “DR1EXMER” “DR1TM201”

for(c in num_cols[ns_vs_linear]){
  g <- ggplot(data, aes_string(x="RIDAGEYR", y=paste0("invNorm(",c,")"), colour="RIAGENDR")) + 
  geom_point(alpha=0.1)+
  geom_smooth(method = lm, formula = y ~ ns(x, knots = seq(30, 80, by = 10)), size = 1)
  print(g)
}

# ns vs inverse norm transformation 
ns_vs_ns_inv <- xwas_result[xwas_result$EnWAS=="ns_inv",]$postive != xwas_result[xwas_result$EnWAS=="ns",]$postive
num_cols[ns_vs_ns_inv]

[1] “DR1EXMER” “DR1TPROT” “DR1TCRYP” “DR1TNIAC” “DR1TVK” “DR1TSELE”

for(c in num_cols[ns_vs_ns_inv]){
  g <- ggplot(data, aes_string(x="RIDAGEYR", y=paste0("invNorm(",c,")"), colour="RIAGENDR")) + 
  geom_point(alpha=0.1)+
  geom_smooth(method = lm, formula = y ~ ns(x, knots = seq(30, 80, by = 10)), size = 1)
  print(g)
}